library(arrow)
## 
## Attaching package: 'arrow'
## The following object is masked from 'package:utils':
## 
##     timestamp
library(fs)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::duration() masks arrow::duration()
## ✖ dplyr::filter()       masks stats::filter()
## ✖ dplyr::lag()          masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:arrow':
## 
##     schema
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(leaflet)
library(leaflet.extras)
library(rlang)
## 
## Attaching package: 'rlang'
## 
## The following objects are masked from 'package:purrr':
## 
##     %@%, flatten, flatten_chr, flatten_dbl, flatten_int, flatten_lgl,
##     flatten_raw, invoke, splice
## 
## The following object is masked from 'package:arrow':
## 
##     string
options(dplyr.summarise.inform = FALSE)

#==================================#
# 1. LOAD DATA
# Load CSV/Parquet files into Polars DataFrames
# If Parquet exists, read it directly; otherwise read CSV, write Parquet, then read
#==================================#
csv_to_parquet <- function(csv_dir, parquet_dir, tables) {
  csv_dir <- path(csv_dir)
  parquet_dir <- path(parquet_dir)
  dir_create(parquet_dir)
  
  dfs <- list()
  
  for (table in tables) {
    parquet_path <- path(parquet_dir, paste0(table, ".parquet"))
    
    if (!file_exists(parquet_path)) {
      message("Creating parquet for: ", table)
      
      open_dataset(
        path(csv_dir, paste0(table, ".csv")),
        format = "csv"
      ) |>
        write_dataset(
          parquet_dir,
          format = "parquet",
          basename_template = paste0(table, ".parquet")
        )
    } 
    
    # read parquet
    dfs[[table]] <- read_parquet(parquet_path)
    message(
      table, ": ",
      nrow(dfs[[table]]), " rows, ",
      ncol(dfs[[table]]), " cols"
    )
  }
  dfs
}

tables <- c(
  "flights_sample_3m",
  "airports_clean"
)

dfs <- csv_to_parquet(
  csv_dir = "../../data/csv",
  parquet_dir = "../../data/parquet",
  tables = tables
)
## flights_sample_3m: 3000000 rows, 32 cols
## airports_clean: 7697 rows, 6 cols
df_flights <- dfs$flights_sample_3m
df_airports <- dfs$airports_clean

#==================================#
# 2. FEATURE ENGINEERING & DATA CLEANING
# 2.1 Feature Engineering
# Time & Distance Features
#==================================#
dow_levels <- c("Monday", "Tuesday", "Wednesday", "Thursday",
                "Friday", "Saturday", "Sunday")

df_flights <- df_flights |>
  mutate(
    FL_DATE = ymd(FL_DATE),
    YEAR = year(FL_DATE),
    MONTH = month(FL_DATE),
    QUARTER = quarter(FL_DATE),
    DAY_OF_WEEK = wday(FL_DATE), 
    
    DISTANCE_CAT = cut(
      DISTANCE,
      breaks = c(-Inf, 500, 1500, Inf),
      labels = c("Short-haul", "Medium-haul", "Long-haul")
    ),
    
    SEASON = case_when(
      MONTH %in% c(12, 1, 2) ~ "Winter",
      MONTH %in% c(3, 4, 5) ~ "Spring",
      MONTH %in% c(6, 7, 8) ~ "Summer",
      TRUE ~ "Autumn"
    ),
    ROUTE = paste0(ORIGIN_CITY, " → ", DEST_CITY),
    DEP_HOUR = as.integer(CRS_DEP_TIME %/% 100)
  )

#----------------------------------#
# Adding Geolocation Columns
#----------------------------------#
airports_ll <- df_airports |>
  filter(!is.na(IATA), IATA != "") |>
  select(IATA, LAT, LON)

df_flights <- df_flights |>
  left_join(airports_ll, join_by(ORIGIN == IATA)) |>
  rename(
    ORIGIN_LAT = LAT,
    ORIGIN_LON = LON
  ) |>
  left_join(airports_ll, join_by(DEST == IATA)) |>
  rename(
    DEST_LAT = LAT,
    DEST_LON = LON
  )

#==================================#
# 2.2 Check for missing values
#==================================#
check_missing_cols <- function(df){
  missing_cols <- c()
  missing_counts <- c()
  no_missing_cols <- c()
  
  for(col in names(df)){
    na_count <- sum(is.na(df[[col]]))
    if(na_count > 0){
      missing_cols <- c(missing_cols, col)
      missing_counts <- c(missing_counts, na_count)
    }
    else{
      no_missing_cols <- c(no_missing_cols, col)
    }
  }
  
  cat("Columns WITH missing values:\n")
  if (length(missing_cols) > 0) {
    for (i in seq_along(missing_cols)) {
      cat("-", missing_cols[i], ":", missing_counts[i], "\n")
    }
  } else {
    cat("None\n")
  }
  
  cat("\nColumns WITHOUT missing values:\n")
  if (length(no_missing_cols) > 0) {
    print(no_missing_cols)
  } else {
    cat("None\n")
  }
}

check_missing_cols(df_flights)
## Columns WITH missing values:
## - DEP_TIME : 77615 
## - DEP_DELAY : 77644 
## - TAXI_OUT : 78806 
## - WHEELS_OFF : 78806 
## - WHEELS_ON : 79944 
## - TAXI_IN : 79944 
## - ARR_TIME : 79942 
## - ARR_DELAY : 86198 
## - CANCELLATION_CODE : 2920860 
## - CRS_ELAPSED_TIME : 14 
## - ELAPSED_TIME : 86198 
## - AIR_TIME : 86198 
## - DELAY_DUE_CARRIER : 2466137 
## - DELAY_DUE_WEATHER : 2466137 
## - DELAY_DUE_NAS : 2466137 
## - DELAY_DUE_SECURITY : 2466137 
## - DELAY_DUE_LATE_AIRCRAFT : 2466137 
## - ORIGIN_LAT : 735 
## - ORIGIN_LON : 735 
## - DEST_LAT : 762 
## - DEST_LON : 762 
## 
## Columns WITHOUT missing values:
##  [1] "FL_DATE"      "AIRLINE"      "AIRLINE_DOT"  "AIRLINE_CODE" "DOT_CODE"    
##  [6] "FL_NUMBER"    "ORIGIN"       "ORIGIN_CITY"  "DEST"         "DEST_CITY"   
## [11] "CRS_DEP_TIME" "CRS_ARR_TIME" "CANCELLED"    "DIVERTED"     "DISTANCE"    
## [16] "YEAR"         "MONTH"        "QUARTER"      "DAY_OF_WEEK"  "DISTANCE_CAT"
## [21] "SEASON"       "ROUTE"        "DEP_HOUR"
#----------------------------------#
# Remove na
#----------------------------------#
# df_flights <- df_flights |>
#   drop_na(ORIGIN_LAT, ORIGIN_LON,
#           DEST_LAT, DEST_LON)

#==================================#
# 2.3 Format numbers
#==================================#
format_compact <- function(n) {
  case_when(
    n >= 1e6 ~ sub("\\.0M$", "M", sprintf("%.2fM", n / 1e6)),
    n >= 1e3 ~ sub("\\.0K$", "K", sprintf("%.2fK", n / 1e3)),
    TRUE ~ as.character(n)
  )
}

#==================================#
# 3. EDA
# 3.1 Homepage
# Summary (Group_by --> Filter last year & first year)
#==================================#
pct_vs_baseline <- function(df, year, baseline, col) {
  n_year <- df |>
    filter(YEAR == year) |>
    pull({{col}})
  
  n_base <- df |>
    filter(YEAR == baseline) |>
    pull({{col}})
  
  round((n_year - n_base) / n_base * 100, 2)
}

get_operated_flights <- function(df){
  df |> filter(CANCELLED == 0 & DIVERTED == 0)
}
  
count_by_year <- function(df, condition = NULL, col_name = "n") {
  condition <- enquo(condition)
  
  if (!quo_is_null(condition)) {
    df <- filter(df, !!condition)
  }
  
  df |>
    group_by(YEAR) |>
    summarise(
      !!col_name := n(),
      .groups = "drop"
    ) |>
    arrange(YEAR)
}

ontime_delay_by <- function(df, group_cols){
  df_op <- get_operated_flights(df)
  
  df_op |>
    mutate(
      n_ontime = as.integer(DEP_DELAY <= 15),
      n_delayed = as.integer(DEP_DELAY > 15),
    ) |>
    group_by(across(all_of(group_cols))) |>
    summarise(
      n_ontime = sum(n_ontime, na.rm = TRUE),
      n_delayed = sum(n_delayed, na.rm = TRUE),
      n_operated = n(),
      avg_delay = mean(DEP_DELAY, na.rm = TRUE),
      .groups = "drop"
    ) |>
    mutate(
      pct_ontime  = n_ontime / n_operated * 100,
      pct_delayed = n_delayed / n_operated * 100
    ) |>
    arrange(across(all_of(group_cols)))
}

#----------------------------------#
flight_yearly <- count_by_year(df_flights, col_name="n_flights")

df_cancel <- count_by_year(
  df_flights,
  CANCELLED == 1,
  col_name = "n_cancelled"
)

df_divert <- count_by_year(
  df_flights,
  DIVERTED == 1,
  col_name = "n_diverted"
)

df_operated <- count_by_year(
  df_flights,
  CANCELLED == 0 & DIVERTED == 0,
  col_name = "n_operated"
)

df_ontime_delay <- ontime_delay_by(df_flights, "YEAR")

pct_23_19_flight <- pct_vs_baseline(flight_yearly, 2023, 2019, n_flights)

pct_23_19_cancel <- pct_vs_baseline(df_cancel, 2023, 2019, n_cancelled)
pct_23_19_divert <- pct_vs_baseline(df_divert, 2023, 2019, n_diverted)
pct_23_19_operated <- pct_vs_baseline(df_operated, 2023, 2019, n_operated)

pct_23_19_ontime <- pct_vs_baseline(df_ontime_delay, 2023, 2019, n_ontime)
pct_23_19_delayed <- pct_vs_baseline(df_ontime_delay, 2023, 2019, n_delayed)

total_flights <- nrow(df_flights)
total_airlines <- n_distinct(df_flights$AIRLINE)
total_airports <- n_distinct(df_flights$ORIGIN)
total_routes <- nrow(df_flights |>
                       distinct(ORIGIN, DEST))

total_flights_fmt <- format_compact(total_flights)

cat(
  "U.S. Flight Operations: 2019–2023\n",
  "Flights volume vs 2019: ", pct_23_19_flight, "\n",
  "Cancelled vs 2019: ", pct_23_19_cancel, "\n",
  "Diverted vs 2019: ", pct_23_19_divert, "\n",
  "Operated vs 2019: ", pct_23_19_operated, "\n",
  "  - On-time vs 2019: ", pct_23_19_ontime, "\n",
  "  - Delayed vs 2019: ", pct_23_19_delayed, "\n",
  "Total flights: ", total_flights_fmt, "\n",
  "Total airlines: ", total_airlines, "\n",
  "Total airports: ", total_airports, "\n",
  "Total routes: ", total_routes, "\n"
)
## U.S. Flight Operations: 2019–2023
##  Flights volume vs 2019:  -38.83 
##  Cancelled vs 2019:  -42.56 
##  Diverted vs 2019:  -36.41 
##  Operated vs 2019:  -38.77 
##    - On-time vs 2019:  -41.79 
##    - Delayed vs 2019:  -24.99 
##  Total flights:  3.00M 
##  Total airlines:  18 
##  Total airports:  380 
##  Total routes:  7785
#==================================#
# Chart: Weekly Distribution
#==================================#
dow_map <- c(
  "1" = "Monday", "2" = "Tuesday", "3" = "Wednesday",
  "4" = "Thursday", "5" = "Friday",
  "6" = "Saturday", "7" = "Sunday"
)

df_dow <- df_flights |>
  group_by(YEAR, DAY_OF_WEEK) |>
  summarise(n_flights = n(), .groups = "drop") |>
  mutate(
    DAY  = dow_map[as.character(DAY_OF_WEEK)],
    YEAR = as.character(YEAR)
  ) |>
  arrange(YEAR, DAY_OF_WEEK)

year_colors <- c(
  "2019" = "#47c6d1",
  "2020" = "#de506f",
  "2021" = "#e657e3",
  "2022" = "#deab4e",
  "2023" = "#4df06b"
)

fig_1 <- plot_ly(
  df_dow,
  x = ~DAY, y = ~n_flights,
  color = ~YEAR, colors = year_colors,
  type = "bar", opacity = 0.8
) |>
  layout(
    barmode = "stack",
    showlegend = FALSE,
    xaxis = list(title = ""),
    yaxis = list(title = "", showgrid = FALSE),
    title = list(
      text = paste0(
        "WEEKLY DISTRIBUTION",
        "<br><span style='font-size:14px;color:#666;font-style:italic'>",
        "Number of flights</span>"
      ),
      x = 0.5,
      xanchor = "center",
      font = list(size = 16, color = "#333", family = "Comic Sans MS")
    ),
    margin = list(l = 10, r = 10, t = 70, b = 10),
    paper_bgcolor = "#f5f5f5",
    plot_bgcolor  = "#f5f5f5"
  ) |>
  config(responsive = TRUE)

fig_1
#==================================#
# Chart: Airline Rankings
#==================================#
df_air_rank <- df_flights |>
  group_by(AIRLINE) |>
  summarise(n_flights = n(), .groups = "drop") |>
  arrange(desc(n_flights)) |>
  mutate(
    # ép factor trước khi plot để reorder
    AIRLINE = factor(AIRLINE, levels = AIRLINE),
    text = sapply(n_flights, format_compact)
  )

fig_2 <- plot_ly(
  data = df_air_rank,
  x = ~n_flights,
  y = ~AIRLINE,
  type = "bar",
  orientation = "h",
  marker = list(color = "#2dd4bf"),
  hovertemplate = "<b>%{y}</b><br>%{customdata} flights<extra></extra>",
  customdata = df_air_rank$text   # số đã format (3.2M)
) |> 
  layout(
    yaxis = list(
      autorange = "reversed",
      title = NULL,
      color = "#F1F5F9"
    ),
    xaxis = list(
      showgrid = FALSE,
      title = NULL,
      color = "#F1F5F9"
    ),
    title = list(
      text = paste0(
        "AIRLINE RANKING",
        "<br><span style='font-size:14px;color:#666;font-style:italic'>Flight Counts</span>"
      ),
      x = 0.5,
      xanchor = "center",
      font = list(size = 16, color = "#333", family = "Comic Sans MS")
    ),
    margin = list(l = 200, r = 50, t = 70, b = 10),
    paper_bgcolor = "rgba(0,0,0,0)",
    plot_bgcolor = "rgba(0,0,0,0)"
  ) |>
  config(responsive = TRUE)

fig_2
#==================================#
# Chart: Monthly Departures
#==================================#
df_monthly_departures <- df_flights |>
  group_by(MONTH) |>
  summarise(n_flights = n(), .groups = "drop") |>
  arrange(MONTH) |>
  mutate(text = sapply(n_flights, format_compact))

fig_3 <- plot_ly(
  df_monthly_departures,
  x = ~MONTH, y = ~n_flights,
  type = "scatter",
  mode = "lines+markers+text",
  text = ~text,
  textposition = "top center",
  cliponaxis = FALSE,
  opacity = 0.8,
  line = list(color = "#c82fd6", width = 3),
  marker = list(size = 8),
  height = 400
) |>
  layout(
    yaxis = list(title = ""),
    xaxis = list(
      title = "",
      showgrid = FALSE,
      type = "category"   # giống fig.update_xaxes(type="category")
    ),
    title = list(
      text = paste0(
        "MONTHLY DEPARTURES",
        "<br><span style='font-size:14px;color:#666;font-style:italic'>",
        "by number of flights</span>"
      ),
      x = 0.5,
      xanchor = "center",
      font = list(size = 16, color = "#333", family = "Comic Sans MS")
    ),
    margin = list(l = 10, r = 10, t = 70, b = 10),
    paper_bgcolor = "#f5f5f5",
    plot_bgcolor  = "#f5f5f5"
  ) |>
  config(responsive = TRUE)

fig_3
#==================================#
# Chart: Flight Distance Distribution
#==================================#
df_distance_departures <- df_flights |>
  group_by(DISTANCE_CAT) |>
  summarise(n_flights = n(), .groups = "drop") |>
  arrange(DISTANCE_CAT)

fig_4 <- plot_ly(
  data = df_distance_departures
) |>
  add_bars(
    x = ~DISTANCE_CAT,
    y = ~n_flights,
    color = ~DISTANCE_CAT,
    colors = c("#f76af7", "#e8b76d", "#51d6b3"),
    opacity = 0.8
  ) |>
  layout(
    showlegend = FALSE,
    xaxis = list(title = ""),
    yaxis = list(title = ""),
    title = list(
      text = paste0(
        "FLIGHT DISTANCE DISTRIBUTION",
        "<br><span style='font-size:14px;color:#666;font-style:italic'>",
        "by number of flights</span>"
      ),
      x = 0.5,
      xanchor = "center",
      font = list(size = 16, color = "#333", family = "Comic Sans MS")
    ),
    margin = list(l = 10, r = 10, t = 70, b = 10),
    paper_bgcolor = "#f5f5f5",
    plot_bgcolor  = "#f5f5f5"
  ) |>
  config(responsive = TRUE)

fig_4
#==================================#
# Chart: National Coverage
#==================================#
df_flights <- df_flights |>
  mutate(
    ORIGIN_AIRPORT = paste0(ORIGIN_CITY, " (", ORIGIN, ")")
  )

df_origin_airport <- df_flights |>
  filter(!is.na(ORIGIN_LAT) & !is.na(ORIGIN_LON)) |>   # loại bỏ dòng NA
  group_by(ORIGIN_AIRPORT, ORIGIN_LAT, ORIGIN_LON) |>
  summarise(n_flights = n(), .groups = "drop") |>
  mutate(text = sapply(n_flights, format_compact))

pal <- colorNumeric(
  palette = c("#3b82f6", "#22c55e", "#f59e0b", "#ef4444"),
  domain = df_origin_airport$n_flights
)

radius_fun <- function(x) pmin(5, log(x + 1) * 4)

origin_map <- leaflet(df_origin_airport) |>
  addProviderTiles(providers$CartoDB.Positron) |>
  setView(
    lng = mean(df_origin_airport$ORIGIN_LON),
    lat = mean(df_origin_airport$ORIGIN_LAT),
    zoom = 3
  ) |>

  # 🔵 layer 1: halo mờ (nhòe)
  addCircleMarkers(
    lng = ~ORIGIN_LON,
    lat = ~ORIGIN_LAT,
    radius = ~radius_fun(n_flights) * 1.8,
    fillColor = ~pal(n_flights),
    fillOpacity = 0.25,
    stroke = FALSE
  ) |>

  # 🔴 layer 2: bubble chính (rõ)
  addCircleMarkers(
    lng = ~ORIGIN_LON,
    lat = ~ORIGIN_LAT,
    radius = ~radius_fun(n_flights),
    fillColor = ~pal(n_flights),
    fillOpacity = 0.75,
    stroke = FALSE,
    label = ~paste0(ORIGIN_AIRPORT, "<br>", text, " flights")
  )

origin_map
#==================================#
# 3.2 Operational Performance
# 3.2.1 Overview + With/Without Filter
# On-time rate = Number of on-time flights / Total number of operated flights * 100
# Delay rate = Number of delayed flights / Total number of operated flights * 100

# This is often simply "100% - Ontime Rate - Canceled Rate - Divert Rate",
# as every flight falls into one of these categories. The average delay time in
# minutes is another common metric.

# Note: Canceled and diverted flights are typically excluded from the on-time
# performance calculation but included in the total pool of flight for overall performance metrics.

# Cancelled rate = Number of canceled flights / Total number of scheduled flights * 100
# Divert rate = Number of diverted flights / Total number of scheduled flights * 100
#==================================#
filter_by_year <- function(df, year) {
  if (!is.null(year)) {
    df <- filter(df, YEAR == year)
  }
  return(df)
}

filter_by_airline <- function(df, airline) {
  if (!is.null(airline)) {
    df <- filter(df, AIRLINE == airline)
  }
  return(df)
}

filter_by_origin_airport <- function(df, airport) {
  if (!is.null(airport)) {
    df <- filter(df, ORIGIN == airport)
  }
  return(df)
}

filter_by_season <- function(df, season) {
  if (!is.null(season)) {
    df <- filter(df, SEASON == season)
  }
  return(df)
}

#==================================#
# On-time Performance & Delays
#==================================#
ontime_delay_summary <- function(df, year = NULL, airline = NULL) {

  # Filter theo điều kiện (nếu có)
  if (!is.null(year)) {
    df <- df |> filter(YEAR == year)
  }

  if (!is.null(airline)) {
    df <- df |> filter(AIRLINE == airline)
  }

  total_flights <- nrow(df)

  # Cancelled & Diverted
  total_cancelled <- df |>
    filter(CANCELLED == 1) |> nrow()

  total_diverted <- df |>
    filter(DIVERTED == 1) |> nrow()

  # Operated flights
  total_operated <- total_flights - total_cancelled - total_diverted

  # On-time flights (<= 15 mins)
  total_on_time <- df |>
    filter(
      DEP_DELAY <= 15,
      CANCELLED == 0,
      DIVERTED == 0
    ) |> nrow()

  total_delayed <- total_operated - total_on_time

  # Tỷ lệ %
  pct_cancelled <- round(total_cancelled / total_flights * 100, 2)
  pct_diverted <- round(total_diverted / total_flights * 100, 2)
  pct_operated <- round(total_operated / total_flights * 100, 2)
  pct_on_time <- round(total_on_time / total_operated * 100, 2)
  pct_delayed <- round(total_delayed / total_operated * 100, 2)

  # Output
  cat("On-time Performance & Delays\n")

  if (is.null(year) && is.null(airline)) {
    cat("Years: 2019–2023 | Airlines: All airlines\n\n")
  } else {
    cat("Years:", year, "| Airlines:", airline, "\n\n")
  }
  
  total_flights_fmt <- format_compact(total_flights)
  
  cat("Total flights:", total_flights_fmt, "\n")
  cat("Cancelled:", pct_cancelled, "%\n")
  cat("Diverted:", pct_diverted, "%\n")
  cat("Operated flights:", pct_operated, "%\n")
  cat("  - On-time:", pct_on_time, "% of operated\n")
  cat("  - Delayed:", pct_delayed, "% of operated\n")
}

ontime_delay_summary(df_flights)
## On-time Performance & Delays
## Years: 2019–2023 | Airlines: All airlines
## 
## Total flights: 3.00M 
## Cancelled: 2.64 %
## Diverted: 0.24 %
## Operated flights: 97.13 %
##   - On-time: 82.47 % of operated
##   - Delayed: 17.53 % of operated
ontime_delay_summary(df_flights, year = 2019)
## On-time Performance & Delays
## Years: 2019 | Airlines: 
## 
## Total flights: 757.67K 
## Cancelled: 1.79 %
## Diverted: 0.26 %
## Operated flights: 97.94 %
##   - On-time: 82.01 % of operated
##   - Delayed: 17.99 % of operated
ontime_delay_summary(df_flights, year = 2019, airline = "United Air Lines Inc.")
## On-time Performance & Delays
## Years: 2019 | Airlines: United Air Lines Inc. 
## 
## Total flights: 63.57K 
## Cancelled: 0.84 %
## Diverted: 0.33 %
## Operated flights: 98.83 %
##   - On-time: 81.46 % of operated
##   - Delayed: 18.54 % of operated
#==================================#
# Chart: MONTHLY DISTRIBUTION OF ON-TIME & DELAYED FLIGHTS
#==================================#
monthly_ontime_delay <- function(df, year = NULL, airline = NULL) {
  
  if (!is.null(year)) {
    df <- filter_by_year(df, year)
  }
  
  if (!is.null(airline)) {
    df <- filter_by_airline(df, airline)
  }
  
  df_monthly <- ontime_delay_by(df, group_cols = "MONTH")
  
  fig <- plot_ly(
    data = df_monthly,
    x = ~MONTH, y = ~n_ontime,
    type = "bar",
    name = "On-time",
    marker = list(color = "#61c9c4")
  ) |>
    add_trace(
      y = ~n_delayed,
      name = "Delayed",
      marker = list(color = "#c3d15c")
    ) |>
    layout(
      barmode = "stack",
      showlegend = FALSE,
      xaxis = list(
        title = "Month",
        dtick = 1
      ),
      yaxis = list(
        title = "Number of operated flights"
      ),
      title = list(
        text = paste0(
          "MONTHLY DISTRIBUTION OF ON-TIME & DELAYED FLIGHTS",
          "<br><span style='font-size:14px;color:#666;font-style:italic'>",
          "Operated Flight Volume</span>"
        ),
        x = 0.5,
        xanchor = "center",
        font = list(size = 16, color = "#333", family = "Comic Sans MS")
      ),
      margin = list(l = 10, r = 10, t = 70, b = 10),
      paper_bgcolor = "#f5f5f5",
      plot_bgcolor = "#f5f5f5"
    )|>
    config(responsive = TRUE)
  fig
}

monthly_ontime_delay(df_flights)
monthly_ontime_delay(df_flights, year = 2019)
monthly_ontime_delay(df_flights, year = 2019, airline = "United Air Lines Inc.")
#==================================#
# Chart: MONTHLY FLIGHT VOLUME VS AVERAGE DELAY
#==================================#
monthly_volume_delay <- function(df, year = NULL, airline = NULL) {
  
  if (!is.null(year)) {
    df <- filter_by_year(df, year)
  }
  
  if (!is.null(airline)) {
    df <- filter_by_airline(df, airline)
  }
  
  df_monthly_perf <- ontime_delay_by(df, group_cols = "MONTH") |>
    select(MONTH, n_operated, avg_delay) |>
    mutate(
      text = format_compact(n_operated),
      avg_delay = round(avg_delay, 1)
    )
  
  fig <- plot_ly()
  
  # --- Bar: number of flights ---
  fig <- fig |>
    add_bars(
      data = df_monthly_perf,
      x = ~MONTH, y = ~n_operated,
      name = "Number of Flights",
      yaxis = "y",
      text = ~text,
      textposition = "outside",
      opacity = 0.65,
      marker = list(color = "#D97706"),
      hovertemplate = paste(
        "<b>Month:</b> %{x}<br>",
        "<b>Flights:</b> %{y:,}<br>",
        "<extra></extra>"
      )
    )
  
  # --- Line: average delay ---
  fig <- fig |>
    add_trace(
      data = df_monthly_perf,
      x = ~MONTH, y = ~avg_delay,
      name = "Avg Departure Delay (min)",
      type = "scatter",
      mode = "lines+markers",
      yaxis = "y2",
      line = list(color = "#2563EB", width = 2, dash = "dash"),
      marker = list(size = 8)
    )
  
  fig <- fig |>
    layout(
      xaxis = list(
        type = "category",
        showgrid = FALSE
      ),
      yaxis = list(
        title = "Number of Flights",
        showgrid = FALSE
      ),
      yaxis2 = list(
        title = "Avg Delay (minutes)",
        overlaying = "y",
        side = "right",
        showgrid = FALSE
      ),
      showlegend = FALSE,
      title = list(
        text = "MONTHLY FLIGHT VOLUME VS AVERAGE DELAY",
        x = 0.5,
        xanchor = "center",
        font = list(size = 16, color = "#333", family = "Comic Sans MS")
      ),
      margin = list(l = 10, r = 10, t = 70, b = 10),
      paper_bgcolor = "#f5f5f5",
      plot_bgcolor = "#f5f5f5"
    )
  fig
}

monthly_volume_delay(df_flights)
monthly_volume_delay(df_flights, year = 2019)
monthly_volume_delay(df_flights, year = 2019, airline = "United Air Lines Inc.")
#==================================#
# Chart: DELAY CAUSES
#==================================#
delay_causes <- function(df, year = NULL, airline = NULL) {
  
  if (!is.null(year)) {
    df <- filter_by_year(df, year)
  }
  
  if (!is.null(airline)) {
    df <- filter_by_airline(df, airline)
  }
  
  delay_cols <- c(
    "DELAY_DUE_CARRIER",
    "DELAY_DUE_WEATHER",
    "DELAY_DUE_NAS",
    "DELAY_DUE_SECURITY",
    "DELAY_DUE_LATE_AIRCRAFT"
  )
  
  cause_map <- c(
    "DELAY_DUE_LATE_AIRCRAFT" = "Late Aircraft",
    "DELAY_DUE_NAS" = "NAS",
    "DELAY_DUE_CARRIER" = "Carrier",
    "DELAY_DUE_WEATHER" = "Weather",
    "DELAY_DUE_SECURITY" = "Security"
  )
  
  df_delay_causes <- df |>
    select(all_of(delay_cols)) |>
    summarise(across(everything(), \(x) sum(x, na.rm = TRUE))) |>
    pivot_longer(
      everything(),
      names_to = "cause",
      values_to = "delay_minutes"
    ) |>
    mutate(
      pct = delay_minutes / sum(delay_minutes) * 100,
      label = cause_map[cause],
      n_blocks = round(pct)
    )
  
  # ---- Fix rounding so total = 100 ----
  diff <- 100 - sum(df_delay_causes$n_blocks)
  if (diff != 0) {
    idx <- which.max(df_delay_causes$n_blocks)
    df_delay_causes$n_blocks[idx] <- df_delay_causes$n_blocks[idx] + diff
  }
  
  df_delay_causes <- df_delay_causes |>
    arrange(desc(pct))
  
  # ---- Build waffle layout ----
  waffle <- unlist(
    mapply(
      function(label, n) rep(label, n),
      df_delay_causes$label,
      df_delay_causes$n_blocks,
      SIMPLIFY = FALSE
    )
  )
  
  rows <- 5
  cols <- 20
  x <- rep(seq_len(cols) - 1, times = rows)
  y <- rep(rev(seq_len(rows) - 1), each = cols)
  
  color_map <- c(
    "Late Aircraft" = "#FDBA74",
    "NAS" = "#93C5FD",
    "Carrier" = "#86EFAC",
    "Weather" = "#FCA5A5",
    "Security" = "#D1D5DB"
  )
  
  fig <- plot_ly(height = 320)
  
  for (cause in names(color_map)) {
    
    mask <- waffle == cause
    pct <- df_delay_causes$pct[df_delay_causes$label == cause][1]
    
    fig <- fig |>
      add_trace(
        x = x[mask], y = y[mask],
        type = "scatter",
        mode = "markers",
        name = cause,
        marker = list(
          size = 22,
          symbol = "square",
          color = color_map[[cause]],
          line = list(width = 1, color = "#f5f5f5")
        ),
        hovertemplate = paste0(
          "<b>", cause, "</b><br>",
          sprintf("%.1f", pct), "% of total delay<br>",
          "<extra></extra>"
        )
      )
  }
  
  fig <- fig |>
    layout(
      title = list(
        text = paste0(
          "Flight Delay Breakdown by Causes",
          "<br><span style='font-size:14px;color:#666;font-style:italic'>",
          "Share of Total Delay Minutes</span>"
        ),
        x = 0.5,
        xanchor = "center"
      ),
      legend = list(
        orientation = "h",
        yanchor = "top", y = 1.2,
        xanchor = "center", x = 0.5,
        font = list(size = 12)
      ),
      margin = list(l = 20, r = 20, t = 140, b = 20),
      xaxis = list(visible = FALSE, range = c(-0.5, 19.5)),
      yaxis = list(visible = FALSE, range = c(-0.5, 4.5)),
      paper_bgcolor = "#f5f5f5",
      plot_bgcolor = "#f5f5f5",
      showlegend = TRUE
    )|>
    config(responsive = TRUE)
  fig
}

delay_causes(df_flights)
delay_causes(df_flights, year = 2019)
delay_causes(df_flights, year = 2019, airline = "United Air Lines Inc.")
#==================================#
# 3.2.2 Local Patterns + With/Without Filter
# Summary
#==================================#
local_patterns <- function(df, year = NULL, airline = NULL,
                           origin_airport = NULL, season = NULL) {
  
  if (!is.null(year)) {
    df <- filter_by_year(df, year)
  }
  
  if (!is.null(airline)) {
    df <- filter_by_airline(df, airline)
  }
  
  if (!is.null(origin_airport)) {
    df <- filter_by_origin_airport(df, origin_airport)
  }
  
  if (!is.null(season)) {
    df <- filter_by_season(df, season)
  }
  
  total_flights <- nrow(df)
  
  df_operated <- df |>
    filter(CANCELLED == 0, DIVERTED == 0)
  
  avg_dep <- df_operated |>
    summarise(avg = mean(DEP_DELAY, na.rm = TRUE)) |>
    pull(avg) |>
    round(2)
  
  avg_arr <- df_operated |>
    summarise(avg = mean(ARR_DELAY, na.rm = TRUE)) |>
    pull(avg) |>
    round(2)
  
  total_flights_fmt <- format_compact(total_flights)
  
  cat("Total flights:", total_flights_fmt, "\n")
  cat("Dep Delay:", avg_dep, "minutes\n")
  cat("Arr Delay:", avg_arr, "minutes\n")
}

local_patterns(df_flights)
## Total flights: 3.00M 
## Dep Delay: 10.05 minutes
## Arr Delay: 4.26 minutes
local_patterns(df_flights, year = 2019)
## Total flights: 757.67K 
## Dep Delay: 10.73 minutes
## Arr Delay: 5.31 minutes
local_patterns(df_flights, year = 2019, airline = "United Air Lines Inc.")
## Total flights: 63.57K 
## Dep Delay: 12.32 minutes
## Arr Delay: 7.66 minutes
local_patterns(df_flights, year = 2019, airline = "United Air Lines Inc.", 
               origin_airport = "DEN")
## Total flights: 6.92K 
## Dep Delay: 11.99 minutes
## Arr Delay: 7.99 minutes
local_patterns(df_flights, year = 2019, airline = "United Air Lines Inc.", 
               origin_airport = "DEN", season = "Summer")
## Total flights: 1.87K 
## Dep Delay: 19.28 minutes
## Arr Delay: 15.3 minutes
#==================================#
# Table: AIRPORT DELAY STABILITY
#==================================#
airport_delay_stability <- function(df, min_flights = 1000, year = NULL,
                                    airline = NULL, season = NULL) {
  
  if (!is.null(year)) {
    df <- filter_by_year(df, year)
  }
  
  if (!is.null(airline)) {
    df <- filter_by_airline(df, airline)
  }
  
  if (!is.null(season)) {
    df <- filter_by_season(df, season)
  }
  
  df_airport <- df |>
    filter(CANCELLED == 0, DIVERTED == 0) |>
    group_by(ORIGIN) |>
    summarise(
      n_flights = n(),
      avg_arr_delay = mean(ARR_DELAY, na.rm = TRUE),
      std_arr_delay = sd(ARR_DELAY, na.rm = TRUE),
      .groups = "drop"
    ) |>
    filter(n_flights >= min_flights) |>
    arrange(desc(std_arr_delay))
  
  return(df_airport)
}

df_stability = airport_delay_stability(df_flights, year = 2019)
head(df_stability, 10)
## # A tibble: 10 × 4
##    ORIGIN n_flights avg_arr_delay std_arr_delay
##    <chr>      <int>         <dbl>         <dbl>
##  1 MSN         1428         10.2           81.0
##  2 SDF         2489          7.75          80.1
##  3 FAT         1234          6.53          72.0
##  4 SJU         2619          9.07          70.4
##  5 LIT         1373         10.1           69.9
##  6 CHS         2568          6.33          69.4
##  7 RIC         2394          5.28          68.8
##  8 HPN         1149          9.92          67.2
##  9 ELP         1700          4.76          66.5
## 10 PBI         2566          8.33          65.4
#==================================#
# Table: ROUTING RANKING
#==================================#
routing_ranking <- function(df, year = NULL, airline = NULL, season = NULL) {
  
  if (!is.null(year)) {
    df <- filter_by_year(df, year)
  }
  
  if (!is.null(airline)) {
    df <- filter_by_airline(df, airline)
  }
  
  if (!is.null(season)) {
    df <- filter_by_season(df, season)
  }
  
  # lọc flight hợp lệ
  df <- filter(
    df,
    CANCELLED == 0,
    DIVERTED == 0
  )
  
  # group & summarize
  result <- df |>
    group_by(ROUTE) |>
    summarise(
      n_flights = n(),
      avg_arr_delay = mean(ARR_DELAY, na.rm = TRUE),
      .groups = "drop"
    ) |>
    arrange(desc(n_flights))
  
  return(result)
}

df_route = routing_ranking(df_flights, year = 2019, airline = "United Air Lines Inc.")
head(df_route, 10)
## # A tibble: 10 × 3
##    ROUTE                               n_flights avg_arr_delay
##    <chr>                                   <int>         <dbl>
##  1 Newark, NJ → San Francisco, CA            499         13.9 
##  2 San Francisco, CA → Newark, NJ            491         10.4 
##  3 Washington, DC → Chicago, IL              468          4.11
##  4 Chicago, IL → Washington, DC              442          6.55
##  5 Chicago, IL → San Francisco, CA           435         14.1 
##  6 Newark, NJ → Los Angeles, CA              435         15.3 
##  7 Chicago, IL → New York, NY                423         14.0 
##  8 San Francisco, CA → Chicago, IL           420         17.7 
##  9 New York, NY → Chicago, IL                403         15.5 
## 10 Los Angeles, CA → San Francisco, CA       390          1.54
#==================================#
# Chart: TIME OF DAY 
# --> AVG ARRIVAL DELAY BY DEPARTURE HOUR
#==================================#
time_of_day <- function(df, year = NULL, airline = NULL, season = NULL) {
  
  data <- df
  
  if (!is.null(year)) {
    data <- filter_by_year(data, year)
  }
  
  if (!is.null(airline)) {
    data <- filter_by_airline(data, airline)
  }
  
  if (!is.null(season)) {
    data <- filter_by_season(data, season)
  }
  
  # ---- Aggregate ----
  delay_avg_hourly <- data |>
    group_by(YEAR, DEP_HOUR) |>
    summarise(
      Avg_Departure_Delay = round(mean(DEP_DELAY, na.rm = TRUE), 1),
      .groups = "drop"
    ) |>
    arrange(YEAR, DEP_HOUR)
  
  # ---- Plot ----
  fig <- plot_ly(
    data = delay_avg_hourly,
    x = ~DEP_HOUR,
    y = ~Avg_Departure_Delay,
    color = ~as.factor(YEAR),
    type = "scatter",
    mode = "lines+markers",
    line = list(shape = "hv")  # step-like
  )
  
  fig <- fig |>
    layout(
      showlegend = FALSE,
      xaxis = list(
        dtick = 1,
        type = "category"
      ),
      yaxis = list(title = NULL),
      title = list(
        text = "AVERAGE DEPARTURE DELAY BY TIME OF DAY",
        x = 0.5
      ),
      margin = list(l = 10, r = 10, t = 70, b = 10),
      paper_bgcolor = "#f5f5f5",
      plot_bgcolor = "#f5f5f5",
      title = list(
        text = "AVERAGE DEPARTURE DELAY BY TIME OF DAY",
        x = 0.5,
        font = list(
          size = 16,
          color = "#333",
          family = "Comic Sans MS"
        )
      )
    ) |>
    config(responsive = TRUE)
  fig
}

time_of_day(df_flights, year = 2019, airline = "United Air Lines Inc.")
#==================================#
# Map: Arrival delay by airport
#==================================#
arrival_delay_folium <- function(
    df,
    year = NULL,
    airline = NULL,
    season = NULL,
    volume_quantile = 0.7,
    tiles = "CartoDB.Positron"
) {
  
  if (!is.null(year)) df <- filter_by_year(df, year)
  if (!is.null(airline)) df <- filter_by_airline(df, airline)
  if (!is.null(season)) df <- filter_by_season(df, season)
  
  df <- df |>
    filter(
      CANCELLED == 0,
      DIVERTED == 0,
      !is.na(ARR_DELAY),
      !is.na(DEST_LAT),
      !is.na(DEST_LON)
    )
  
  airport_perf <- df |>
    group_by(DEST, DEST_LAT, DEST_LON) |>
    summarise(
      n_flights = n(),
      med_arr_delay = round(median(ARR_DELAY), 1),
      .groups = "drop"
    )
  
  q <- quantile(airport_perf$n_flights, volume_quantile)
  airport_perf <- airport_perf |> filter(n_flights >= q)
  
  pal <- colorNumeric(
    palette = c("yellow", "blue", "red"),
    domain = airport_perf$med_arr_delay
  )
  
  radius_fun <- function(x) scales::rescale(x, to = c(6, 14))
  
  leaflet(airport_perf) |>
    addProviderTiles(tiles) |>
    
    addCircleMarkers(
      lng = ~DEST_LON,
      lat = ~DEST_LAT,
      radius = ~radius_fun(n_flights),
      fillColor = ~pal(med_arr_delay),
      fillOpacity = 0.75,
      color = NA,
      
      popup = ~paste0(
        "<b>", DEST, "</b><br>",
        "Median delay: ", med_arr_delay, " min<br>",
        "Flights: ", format(n_flights, big.mark = ",")
      ),
      
      clusterOptions = markerClusterOptions(
        disableClusteringAtZoom = 5,
        maxClusterRadius = 60,
        spiderfyOnMaxZoom = TRUE
      )
    )
}

m = arrival_delay_folium(df_flights, year = 2019, season="Winter")
m
#==================================#
# 3.2.3 Local Patterns + With/Without Filter
# - Performance is evaluated relative to peer airlines under the selected filters.
# - Results reflect comparative delay behavior rather than overall service quality.
# Chart: INFLUENCE OF VARIOUS DELAYS
#==================================#
influence_of_delays <- function(df, year = NULL, airline = NULL, season = NULL) {
  
  delay_cols <- c(
    "DEP_DELAY",
    "ARR_DELAY",
    "DELAY_DUE_CARRIER",
    "DELAY_DUE_WEATHER",
    "DELAY_DUE_NAS",
    "DELAY_DUE_SECURITY",
    "DELAY_DUE_LATE_AIRCRAFT"
  )
  
  if (!is.null(year)) 
    df <- filter_by_year(df, year)
  
  if (!is.null(airline)) 
    df <- filter_by_airline(df, airline)
  
  if (!is.null(season)) 
    df <- filter_by_season(df, season)
  
  df <- df |> filter(DEP_DELAY > 0)
  
  values <- df |>
    select(all_of(delay_cols)) |>
    summarise(across(everything(), \(x) sum(x, na.rm = TRUE))) |>
    unlist(use.names = FALSE)
  
  rel <- values / max(values, na.rm = TRUE)
  z <- scales::rescale(rel, to = c(0, 1))
  
  # ---- màu Spectral giống matplotlib ----
  pal <- grDevices::colorRampPalette(
    RColorBrewer::brewer.pal(11, "Spectral")
  )(100)
  
  col_idx <- round(z * 99) + 1
  cell_colors <- pal[col_idx]
  
  rgb <- grDevices::col2rgb(cell_colors) / 255
  luminance <- 0.299 * rgb[1, ] + 0.587 * rgb[2, ] + 0.114 * rgb[3, ]
  
  text_color <- ifelse(luminance < 0.5, "white", "black")
  
  plot_df <- data.frame(
    y = factor(delay_cols, levels = rev(delay_cols)),
    x = " ",
    z = z,
    label = sprintf("%.2f", rel),
    text_color = text_color
  )
  
  title_parts <- c("Influence of Delay Factors")
  if (!is.null(year)) title_parts <- c(title_parts, paste("Year:", year))
  if (!is.null(airline)) title_parts <- c(title_parts, paste("Airline:", airline))
  if (!is.null(season)) title_parts <- c(title_parts, paste("Season:", season))
  
  fig <- plot_ly()
  
  # ---- HEATMAP (chỉ heatmap có z) ----
  fig <- fig |>
    add_heatmap(
      data = plot_df,
      x = ~x, y = ~y, z = ~z,
      colorscale = "Spectral",
      zmin = 0, zmax = 1,
      showscale = TRUE
    )
  
  # ---- TEXT TRẮNG ----
  fig <- fig |>
    add_trace(
      data = subset(plot_df, text_color == "white"),
      x = ~x, y = ~y,
      text = ~label,
      type = "scatter",
      mode = "text",
      textfont = list(color = "white", size = 12),
      showlegend = FALSE
    )
  
  # ---- TEXT ĐEN ----
  fig <- fig |>
    add_trace(
      data = subset(plot_df, text_color == "black"),
      x = ~x, y = ~y,
      text = ~label,
      type = "scatter",
      mode = "text",
      textfont = list(color = "black", size = 12),
      showlegend = FALSE
    )
  
  fig <- fig |>
    layout(
      title = paste(title_parts, collapse = " | "),
      xaxis = list(showticklabels = FALSE),
      yaxis = list(title = ""),
      margin = list(l = 120, r = 40, t = 60, b = 40)
    ) |>
    config(responsive = TRUE)
  fig
}

influence_of_delays(df_flights, year = 2019)
#==================================#
# Chart: Marginal Effect of Delay Components on Arrival Delay
#==================================#
delay_factor_interaction <- function(df, year = NULL, airline = NULL, season = NULL) {
  
  delay_cols <- c(
    "DEP_DELAY",
    "DELAY_DUE_CARRIER",
    "DELAY_DUE_WEATHER",
    "DELAY_DUE_NAS",
    "DELAY_DUE_SECURITY",
    "DELAY_DUE_LATE_AIRCRAFT"
  )
  
  if (!is.null(year)) df <- filter_by_year(df, year)
  if (!is.null(airline)) df <- filter_by_airline(df, airline)
  if (!is.null(season)) df <- filter_by_season(df, season)
  
  long_df <- df |>
    pivot_longer(
      cols = all_of(delay_cols),
      names_to = "factor",
      values_to = "delay_value"
    ) |>
    mutate(
      delay_bin = case_when(
        delay_value <= 0  ~ "0",
        delay_value <= 5  ~ "1–5",
        delay_value <= 15 ~ "5–15",
        delay_value <= 30 ~ "15–30",
        TRUE              ~ ">30"
      )
    )
  
  summary <- long_df |>
    group_by(factor, delay_bin) |>
    summarise(
      mean_arr_delay = mean(ARR_DELAY, na.rm = TRUE),
      n_flights = n(),
      .groups = "drop"
    ) |>
    filter(n_flights > 1000)
  
  summary$delay_bin <- factor(
    summary$delay_bin,
    levels = c("0", "1–5", "5–15", "15–30", ">30")
  )
  
  factors <- unique(summary$factor)
  
  plots <- lapply(factors, function(f) {
    df_f <- summary |> filter(factor == f)
    
    plot_ly(
      df_f,
      x = ~delay_bin, y = ~mean_arr_delay,
      type = "scatter",
      mode = "lines+markers",
      showlegend = FALSE
    )
  })
  
  n_cols <- 3
  n_rows <- ceiling(length(plots) / n_cols)
  
  fig <- subplot(
    plots,
    nrows = n_rows,
    shareX = TRUE,
    shareY = TRUE,
    titleX = TRUE,
    titleY = TRUE
  )
  
  # ---- thêm tiêu đề cho từng facet ----
  annotations <- lapply(seq_along(factors), function(i) {
    list(
      text = factors[i],
      x = ((i - 1) %% n_cols + 0.5) / n_cols,
      y = 1 - (floor((i - 1) / n_cols) / n_rows),
      xref = "paper",
      yref = "paper",
      showarrow = FALSE,
      font = list(size = 13)
    )
  })
  
  fig <- fig |>
    layout(
      title = "Marginal Effect of Delay Components on Arrival Delay",
      annotations = annotations,
      margin = list(t = 90)
    ) |>
    config(responsive = TRUE)
  fig
}

delay_factor_interaction(df_flights, year = 2019, season = "Spring")
#==================================#
# Chart: Reliability vs Performance of Delay Factors
#==================================#
reliability_vs_performance <- function(df, year = NULL, airline = NULL, season = NULL) {
  
  delay_cols <- c(
    "DEP_DELAY",
    "DELAY_DUE_CARRIER",
    "DELAY_DUE_WEATHER",
    "DELAY_DUE_NAS",
    "DELAY_DUE_SECURITY",
    "DELAY_DUE_LATE_AIRCRAFT"
  )
  
  if (!is.null(year)) df <- filter_by_year(df, year)
  if (!is.null(airline)) df <- filter_by_airline(df, airline)
  if (!is.null(season)) df <- filter_by_season(df, season)
  
  long_df <- df |>
    pivot_longer(
      cols = all_of(delay_cols),
      names_to = "factor",
      values_to = "delay_value"
    ) |>
    filter(delay_value > 0)
  
  summary <- long_df |>
    group_by(factor) |>
    summarise(
      mean_arr_delay = mean(ARR_DELAY, na.rm = TRUE),
      std_arr_delay  = sd(ARR_DELAY, na.rm = TRUE),
      n_flights      = n(),
      .groups = "drop"
    ) |>
    filter(n_flights > 200)
  
  fig <- plot_ly(
    data = summary,
    x = ~mean_arr_delay,
    y = ~std_arr_delay,
    type = "scatter",
    mode = "markers",
    size = ~n_flights,
    color = ~factor,
    sizes = c(10, 50),
    marker = list(opacity = 0.75),
    text = ~paste("Factor:", factor,
                  "<br>Mean:", round(mean_arr_delay,1),
                  "<br>Std:", round(std_arr_delay,1),
                  "<br>N flights:", n_flights),
    hoverinfo = "text"
  )
  
  fig <- layout(
    fig,
    title = list(text = "Reliability vs Performance of Delay Factors"),
    xaxis = list(title = "Mean Arrival Delay (minutes)"),
    yaxis = list(title = "Variability of Arrival Delay (Std Dev)"),
    legend = list(title = list(text = "Delay Factor"))
  )
  fig <- config(fig, responsive = TRUE)
}

reliability_vs_performance(df_flights, season = "Spring")

#==================================#
# 3.2.4 Discruption + With/Without Filter
# Summary
#==================================#
disruption_metrics <- function(df, year = NULL, airline = NULL, season = NULL) {
  
  if (!is.null(year)) 
    df <- filter_by_year(df, year)
  
  if (!is.null(airline)) 
    df <- filter_by_airline(df, airline)
  
  if (!is.null(season)) 
    df <- filter_by_season(df, season)
  
  total_flights <- nrow(df)
  
  summary <- df |>
    summarise(
      cancelled = sum(CANCELLED, na.rm = TRUE),
      diverted  = sum(DIVERTED, na.rm = TRUE)
    ) |>
    mutate(
      disrupted     = cancelled + diverted,
      total_flights = total_flights
    )
  
  cancelled <- summary$cancelled
  diverted  <- summary$diverted
  disrupted <- summary$disrupted
  
  cancel_rate <- round(cancelled / total_flights * 100, 2)
  divert_rate <- round(diverted / total_flights * 100, 2)
  disruption_rate <- round(disrupted / total_flights * 100, 2)
  
  # ---- in kết quả ----
  total_flights_fmt <- format(total_flights, big.mark = ",")
  
  cat("📊 Disruption Summary\n")
  cat(strrep("-", 30), "\n")
  cat("Total flights:", total_flights_fmt, "\n")
  cat("Cancelled flights:", cancel_rate, "%\n")
  cat("Diverted flights:", divert_rate, "%\n")
  cat("Total disrupted:", disruption_rate, "%\n")
}

disruption_metrics(df_flights, season="Winter")
## 📊 Disruption Summary
## ------------------------------ 
## Total flights: 732,588 
## Cancelled flights: 2.72 %
## Diverted flights: 0.22 %
## Total disrupted: 2.94 %
#==================================#
# Chart: Flight Disruption Rate Over Time (Cancelled vs Diverted)
#==================================#
plot_disruption_bar <- function(df, year = NULL, airline = NULL, season = NULL) {
  
  if (!is.null(year)) 
    df <- df |> filter(YEAR %in% year)
  
  if (!is.null(airline)) 
    df <- df |> filter(AIRLINE %in% airline)
  
  if (!is.null(season)) 
    df <- df |> filter(SEASON %in% season)
  
  df_disruption <- df |>
    mutate(
      DISRUPTION_TYPE = case_when(
        CANCELLED == 1 ~ "Cancelled",
        DIVERTED == 1  ~ "Diverted",
        TRUE ~ NA_character_
      )
    ) |>
    filter(!is.na(DISRUPTION_TYPE))
  
  df_grouped <- df_disruption |>
    group_by(YEAR, SEASON, DISRUPTION_TYPE) |>
    summarise(COUNT = n(), .groups = "drop")
  
  total_per_year <- df |>
    group_by(YEAR) |>
    summarise(TOTAL = n(), .groups = "drop")
  
  df_grouped <- df_grouped |>
    left_join(total_per_year, by = "YEAR") |>
    mutate(RATE = COUNT / TOTAL * 100)
  
  p <- ggplot(
    df_grouped,
    aes(x = YEAR, y = RATE, fill = DISRUPTION_TYPE)
  ) +
    geom_col(position = "stack") +
    facet_wrap(~ SEASON, ncol = 2) +
    scale_y_continuous(labels = scales::percent_format(scale = 1)) +
    labs(
      title = "Flight Disruption Rate by Season",
      x = "Year",
      y = "Disruption Rate (%)",
      fill = "Disruption Type"
    ) +
    theme_minimal(base_size = 13) +
    theme(
      legend.position = "bottom",
      strip.text = element_text(face = "bold"),
      plot.title = element_text(hjust = 0.5)
    )
  
  # ---------------- Convert to plotly ----------------
  ggplotly(p, tooltip = c("x", "y", "fill")) |>
    config(responsive = TRUE)
}

plot_disruption_bar(df_flights)
#==================================#
# Chart: Share of Disruption Causes
#==================================#
plot_cause_donut <- function(df, year = NULL, airline = NULL, season = NULL) {
  
  if (!is.null(year)) df <- filter_by_year(df, year)
  if (!is.null(airline)) df <- filter_by_airline(df, airline)
  if (!is.null(season)) df <- filter_by_season(df, season)
  
  causes <- c(
    "DELAY_DUE_CARRIER",
    "DELAY_DUE_WEATHER",
    "DELAY_DUE_NAS",
    "DELAY_DUE_SECURITY",
    "DELAY_DUE_LATE_AIRCRAFT"
  )
  
  cause_labels <- c(
    DELAY_DUE_CARRIER = "Delay due Carrier",
    DELAY_DUE_WEATHER = "Delay due Weather",
    DELAY_DUE_NAS = "Delay due NAS",
    DELAY_DUE_SECURITY = "Delay due Security",
    DELAY_DUE_LATE_AIRCRAFT = "Delay due Late Aircraft"
  )
  
  df_long <- df |>
    select(all_of(causes)) |>
    pivot_longer(
      cols = everything(),
      names_to = "CAUSE",
      values_to = "DELAY_MIN"
    ) |>
    filter(DELAY_MIN > 0)
  
  df_grouped <- df_long |>
    group_by(CAUSE) |>
    summarise(TOTAL_DELAY = sum(DELAY_MIN), .groups = "drop") |>
    mutate(
      RATE = TOTAL_DELAY / sum(TOTAL_DELAY) * 100,
      CAUSE_LABEL = cause_labels[CAUSE]
    ) |>
    arrange(desc(RATE))
  
  # highlight phần lớn nhất
  pull <- rep(0, nrow(df_grouped))
  pull[which.max(df_grouped$RATE)] <- 0.1
  
  fig <- plot_ly(
    data = df_grouped,
    labels = ~CAUSE_LABEL,
    values = ~RATE,
    type = "pie",
    hole = 0.5,
    pull = pull,
    textinfo = "label+percent",
    hovertemplate = "%{label}: %{value:.2f}%<extra></extra>"
  )
  
  fig <- fig |>
    layout(
      title = list(text = "Share of Disruption Causes"),
      showlegend = FALSE
    ) |>
    config(responsive = TRUE)
  fig
}

plot_cause_donut(df_flights, season="Winter")